0.1 Импорт

Импорт phyloseq объекта и библиотек
Датасет - хроносерия разложения соломы - от фактор Day 10ти уровней

library(phyloseq)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggpubr)
library(ampvis2)
library(ANCOMBC)
library(heatmaply)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust vegan
## 
## ======================
## Welcome to heatmaply version 1.3.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
## 
## Attaching package: 'fastcluster'
## 
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## 
## Attaching package: 'WGCNA'
## 
## The following object is masked from 'package:stats':
## 
##     cor
library(compositions)
## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
## 
## 
## Attaching package: 'compositions'
## 
## The following object is masked from 'package:WGCNA':
## 
##     cor
## 
## The following object is masked from 'package:heatmaply':
## 
##     normalize
## 
## The following objects are masked from 'package:stats':
## 
##     anova, cor, cov, dist, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, norm, scale, scale.default
library(igraph)
## 
## Attaching package: 'igraph'
## 
## The following object is masked from 'package:compositions':
## 
##     normalize
## 
## The following object is masked from 'package:heatmaply':
## 
##     normalize
## 
## The following object is masked from 'package:plotly':
## 
##     groups
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
ps.f <- readRDS("psf2")

0.2 functions

#phyloseq object to ampvis2 object
#https://gist.github.com/KasperSkytte/8d0ca4206a66be7ff6d76fc4ab8e66c6

phyloseq_to_ampvis2 <- function(physeq) {
  #check object for class
  if(!any(class(physeq) %in% "phyloseq"))
    stop("physeq object must be of class \"phyloseq\"", call. = FALSE)
  
  #ampvis2 requires taxonomy and abundance table, phyloseq checks for the latter
  if(is.null(physeq@tax_table))
    stop("No taxonomy found in the phyloseq object and is required for ampvis2", call. = FALSE)
  
  #OTUs must be in rows, not columns
  if(phyloseq::taxa_are_rows(physeq))
    abund <- as.data.frame(phyloseq::otu_table(physeq)@.Data)
  else
    abund <- as.data.frame(t(phyloseq::otu_table(physeq)@.Data))
  
  #tax_table is assumed to have OTUs in rows too
  tax <- phyloseq::tax_table(physeq)@.Data
  
  #merge by rownames (OTUs)
  otutable <- merge(
    abund,
    tax,
    by = 0,
    all.x = TRUE,
    all.y = FALSE,
    sort = FALSE
  )
  colnames(otutable)[1] <- "OTU"
  
  #extract sample_data (metadata)
  if(!is.null(physeq@sam_data)) {
    metadata <- data.frame(
      phyloseq::sample_data(physeq),
      row.names = phyloseq::sample_names(physeq), 
      stringsAsFactors = FALSE, 
      check.names = FALSE
    )
    
    #check if any columns match exactly with rownames
    #if none matched assume row names are sample identifiers
    samplesCol <- unlist(lapply(metadata, function(x) {
      identical(x, rownames(metadata))}))
    
    if(any(samplesCol)) {
      #error if a column matched and it's not the first
      if(!samplesCol[[1]])
        stop("Sample ID's must be in the first column in the sample metadata, please reorder", call. = FALSE)
    } else {
      #assume rownames are sample identifiers, merge at the end with name "SampleID"
      if(any(colnames(metadata) %in% "SampleID"))
        stop("A column in the sample metadata is already named \"SampleID\" but does not seem to contain sample ID's", call. = FALSE)
      metadata$SampleID <- rownames(metadata)
      
      #reorder columns so SampleID is the first
      metadata <- metadata[, c(which(colnames(metadata) %in% "SampleID"), 1:(ncol(metadata)-1L)), drop = FALSE]
    }
  } else
    metadata <- NULL
  
  #extract phylogenetic tree, assumed to be of class "phylo"
  if(!is.null(physeq@phy_tree)) {
    tree <- phyloseq::phy_tree(physeq)
  } else
    tree <- NULL
  
  #extract OTU DNA sequences, assumed to be of class "XStringSet"
  if(!is.null(physeq@refseq)) {
    #convert XStringSet to DNAbin using a temporary file (easiest)
    fastaTempFile <- tempfile(pattern = "ampvis2_", fileext = ".fa")
    Biostrings::writeXStringSet(physeq@refseq, filepath = fastaTempFile)
  } else
    fastaTempFile <- NULL
  
  #load as normally with amp_load
  ampvis2::amp_load(
    otutable = otutable,
    metadata = metadata,
    tree = tree,
    fasta = fastaTempFile
  )
}

#variance stabilisation from DESeq2

ps_vst <- function(ps, factor){
  
  require(DESeq2)
  require(phyloseq)
  
  diagdds = phyloseq_to_deseq2(ps, as.formula(paste( "~", factor)))              
  diagdds = estimateSizeFactors(diagdds, type="poscounts")
  diagdds = estimateDispersions(diagdds, fitType = "local") 
  pst <- varianceStabilizingTransformation(diagdds)
  pst.dimmed <- t(as.matrix(assay(pst))) 
  # pst.dimmed[pst.dimmed < 0.0] <- 0.0
  ps.varstab <- ps
  otu_table(ps.varstab) <- otu_table(pst.dimmed, taxa_are_rows = FALSE) 
  return(ps.varstab)
}

#WGCNA visualisation
#result - list class object with attributes:
# ps - phyloseq object
# amp - ampwis2 object
# heat - heatmap with absalute read numbers
# heat_rel - hetmap with relative abundances
# tree - phylogenetic tree with taxonomy

color_filt <- function(ps, df){
  
      library(tidyverse)
      library(reshape2)
      library(gridExtra)

  
  l = list()
  for (i in levels(df$module)){
    message(i)
    color_name <-  df %>% 
      filter(module == i) %>% 
      pull(asv) %>% 
      unique()
    ps.col <- prune_taxa(color_name, ps)
    amp.col <- phyloseq_to_amp(ps.col)
    heat <- amp_heatmap(amp.col, tax_show = 60, 
              group_by = "Day", 
              tax_aggregate = "OTU",
              tax_add = "Genus", 
              normalise=FALSE, 
              showRemainingTaxa = TRUE)
    
    ps.rel  <-  phyloseq::transform_sample_counts(ps.col, function(x) x / sum(x) * 100)
    amp.r <- phyloseq_to_ampvis2(ps.rel)
    heat.rel <- amp_heatmap(amp.r, tax_show = 60, 
          group_by = "Day", 
          tax_aggregate = "OTU",
          tax_add = "Genus", 
          normalise=FALSE, 
          showRemainingTaxa = TRUE)
    
    tree <- ps.col@phy_tree
    taxa <- as.data.frame(ps.col@tax_table@.Data)
    p1 <- ggtree(tree) + 
              geom_tiplab(size=2, align=TRUE, linesize=.5) +
              theme_tree2()
    
    taxa[taxa == "Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium"] <- "Allorhizobium"
    taxa[taxa == "Burkholderia-Caballeronia-Paraburkholderia"] <- "Burkholderia"
  
    tx <- taxa %>% 
      rownames_to_column("id") %>% 
      mutate(id = factor(id, levels = rev(get_taxa_name(p1)))) %>% 
      dplyr::select(-c(Kingdom, Species, Order)) %>% 
      melt(id.var = 'id')

    p2 <- ggplot(tx, aes(variable, id)) + 
      geom_tile(aes(fill = value), alpha = 0.4) +
      geom_text(aes(label = value), size = 2) +
      theme_bw() + 
      theme(legend.position = "none",
             axis.ticks.x=element_blank(),
             axis.text.y=element_blank(),
             axis.ticks.y=element_blank(),
             axis.title.x = element_blank(),
            axis.title.y = element_blank()) 

    p <- ggpubr::ggarrange(p1, p2, widths = c(0.9, 1))
    l[[i]] <- list("ps" = ps.col, 
                   "amp" = amp.col,
                   "heat" = heat,
                   "heat_rel" = heat.rel,
                   "tree" = p)
                    
  }
  return(l)
}

Разделим датасет на две группы

1-я - в более чем 10% образцов должно быть хотя бы 10 ридов - эта группа пойдет в анализ далее

ps.inall <- phyloseq::filter_taxa(ps.f, function(x) sum(x > 10) > (0.1*length(x)), TRUE)
amp.inall <- phyloseq_to_ampvis2(ps.inall)
amp_heatmap(amp.inall,
            tax_show = 40,
            group_by = "Day",
            tax_aggregate = "OTU",
            tax_add = "Genus",
            normalise=FALSE,
            showRemainingTaxa = TRUE)

Вторая - оставшиеся, но более 100 ридов по всем образцам(далее - вылетающие мажоры(ВМ))
Остальные филотипы выкидываем из анализа

ps.exl  <- phyloseq::filter_taxa(ps.f, function(x) sum(x > 10) < (0.1*length(x)), TRUE)
ps.exl  <- prune_taxa(taxa_sums(ps.exl) > 100, ps.exl)
amp.exl <- phyloseq_to_ampvis2(ps.exl)
amp_heatmap(amp.exl,
            tax_show = 40,
            group_by = "Day",
            tax_aggregate = "OTU",
            tax_add = "Genus",
            normalise=FALSE,
            showRemainingTaxa = TRUE)

То же, но c относительной представленностью.

ps.per  <-  phyloseq::transform_sample_counts(ps.f, function(x) x / sum(x) * 100) 
ps.exl.taxa <- taxa_names(ps.exl)
ps.per.exl <- prune_taxa(ps.exl.taxa, ps.per)
amp.exl.r <- phyloseq_to_ampvis2(ps.per.exl)

amp_heatmap(amp.exl.r, tax_show = 60, 
      group_by = "Day", 
      tax_aggregate = "OTU",
      tax_add = "Genus", 
      round =  2, 
      normalise=FALSE, 
      showRemainingTaxa = TRUE)

amp_heatmap(amp.exl.r, tax_show = 60, 
      group_by = "Day", 
      tax_aggregate = "OTU",
      tax_add = "Genus", 
      round =  2, 
      normalise=FALSE, )

ВМ объединенные по родам. Относительная представленность.

amp_heatmap(amp.exl.r,
            tax_show = 30, 
            group_by = "Day", 
            tax_aggregate = "Genus",
            tax_add = "Phylum",
            tax_class = "Proteobacteria",
            round =  2,
            normalise=FALSE, 
            showRemainingTaxa = TRUE)

Если посмотреть, если как распределяются ВМ по дням - похоже распределение ВМ связано не только с особенностью отдельных мешочков, но и с чисто техническими особенностями - вылетающие значения вылетают в основном не с биологическими повторностями, а с техническими(красная линия)

p_box <- phyloseq::sample_sums(ps.per.exl) %>% 
  as.data.frame(col.names = "values") %>% 
  setNames(., nm = "values") %>% 
  rownames_to_column("samples") %>% 
  mutate(Day = sapply(strsplit(samples, "-"), `[`, 3)) %>% 
  ggplot(aes(x=Day, y=values, color=Day, fill = Day)) +
  geom_boxplot(aes(color=Day, fill = Day)) +
  geom_point(color = "black", position = position_dodge(width=0.2)) +
  geom_hline(yintercept = 10, colour = "red") +
  theme_bw() +
  theme(legend.position = "none")
     
p_box <- p_box + viridis::scale_color_viridis(option = "H", discrete = TRUE, direction=1, begin=0.1, end = 0.9, alpha = 0.5)

p_box + viridis::scale_fill_viridis(option = "H", discrete = TRUE, direction=1, begin=0.1, end = 0.9, alpha = 0.3)

0.2.1 WGCNA

после clr нормализации

otu.inall <- as.data.frame(ps.inall@otu_table@.Data) 
ps.inall.clr <- ps.inall
otu_table(ps.inall.clr) <- phyloseq::otu_table(compositions::clr(otu.inall), taxa_are_rows = FALSE)
data <- ps.inall.clr@otu_table@.Data %>% 
  as.data.frame()
rownames(data) <- as.character(ps.inall.clr@sam_data$Description)
powers <-  c(c(1:10), seq(from = 12, to=30, by=1))
sft <-  pickSoftThreshold(data, powerVector = powers, verbose = 5, networkType = "signed")
## pickSoftThreshold: will use block size 338.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 338 of 338
## Warning: executing %dopar% sequentially: no parallel backend registered
##    Power SFT.R.sq  slope truncated.R.sq  mean.k. median.k.  max.k.
## 1      1    0.185  23.60         0.8790 168.0000 168.00000 172.000
## 2      2    0.199 -15.50         0.8970  87.6000  87.20000  95.100
## 3      3    0.518 -13.10         0.8580  47.3000  46.80000  55.900
## 4      4    0.390  -6.46         0.8050  26.5000  26.00000  34.500
## 5      5    0.262  -3.46         0.7820  15.4000  14.90000  22.300
## 6      6    0.232  -2.30         0.7770   9.2300   8.78000  15.000
## 7      7    0.251  -1.87         0.6990   5.7300   5.36000  10.500
## 8      8    0.319  -1.75         0.7570   3.6800   3.41000   7.620
## 9      9    0.527  -2.07         0.7770   2.4400   2.20000   5.890
## 10    10    0.648  -2.28         0.8490   1.6600   1.47000   4.690
## 11    12    0.827  -2.41         0.8970   0.8420   0.69300   3.190
## 12    13    0.793  -2.38         0.7930   0.6220   0.49000   2.700
## 13    14    0.277  -3.97         0.0914   0.4690   0.35500   2.320
## 14    15    0.282  -3.83         0.1000   0.3610   0.26100   2.010
## 15    16    0.894  -2.17         0.9030   0.2820   0.19800   1.770
## 16    17    0.905  -2.10         0.9080   0.2240   0.15000   1.560
## 17    18    0.922  -2.04         0.9240   0.1810   0.11500   1.390
## 18    19    0.927  -2.01         0.9230   0.1480   0.08820   1.250
## 19    20    0.938  -1.93         0.9370   0.1220   0.06780   1.130
## 20    21    0.946  -1.86         0.9470   0.1020   0.05260   1.020
## 21    22    0.315  -3.12         0.2220   0.0856   0.04140   0.926
## 22    23    0.303  -3.96         0.1560   0.0728   0.03270   0.844
## 23    24    0.307  -2.92         0.1940   0.0623   0.02590   0.772
## 24    25    0.303  -2.84         0.1800   0.0538   0.02060   0.708
## 25    26    0.921  -1.63         0.8990   0.0467   0.01650   0.657
## 26    27    0.892  -1.61         0.8650   0.0408   0.01320   0.622
## 27    28    0.155  -1.93        -0.0202   0.0359   0.01040   0.591
## 28    29    0.199  -2.77        -0.0228   0.0317   0.00841   0.563
## 29    30    0.199  -2.69        -0.0203   0.0282   0.00684   0.537
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=0.9,col="red")
abline(h=0.9,col="salmon")

после vst нормализации

ps.varstab <- ps_vst(ps.inall, "Day")
data2 <- ps.varstab@otu_table@.Data %>% 
  as.data.frame()
rownames(data2) <- as.character(ps.varstab@sam_data$Description)
powers <-  c(seq(from = 1, to=10, by=0.5), seq(from = 11, to=20, by=1))
sft2 <-  pickSoftThreshold(data2, powerVector = powers, verbose = 5, networkType = "signed hybrid")
## pickSoftThreshold: will use block size 338.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 338 of 338
##    Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1    1.0    0.409 -1.05          0.704  50.100  45.60000  97.80
## 2    1.5    0.578 -1.07          0.895  30.900  27.00000  71.20
## 3    2.0    0.723 -1.15          0.942  20.300  17.90000  53.60
## 4    2.5    0.767 -1.24          0.871  14.000  12.00000  41.70
## 5    3.0    0.809 -1.31          0.909  10.000   7.86000  33.40
## 6    3.5    0.872 -1.31          0.958   7.380   5.39000  27.30
## 7    4.0    0.888 -1.30          0.968   5.580   3.76000  22.60
## 8    4.5    0.858 -1.36          0.910   4.320   2.67000  19.00
## 9    5.0    0.892 -1.36          0.943   3.400   1.94000  16.20
## 10   5.5    0.899 -1.38          0.954   2.720   1.41000  13.90
## 11   6.0    0.896 -1.39          0.962   2.210   1.07000  12.10
## 12   6.5    0.897 -1.33          0.962   1.820   0.84700  10.60
## 13   7.0    0.910 -1.30          0.970   1.520   0.65400   9.30
## 14   7.5    0.914 -1.31          0.966   1.280   0.51500   8.25
## 15   8.0    0.913 -1.28          0.954   1.090   0.41500   7.36
## 16   8.5    0.905 -1.28          0.937   0.938   0.34000   6.59
## 17   9.0    0.895 -1.30          0.913   0.813   0.27800   5.99
## 18   9.5    0.878 -1.33          0.900   0.709   0.23200   5.49
## 19  10.0    0.886 -1.34          0.894   0.624   0.19100   5.07
## 20  11.0    0.911 -1.31          0.928   0.491   0.12500   4.35
## 21  12.0    0.890 -1.29          0.894   0.396   0.08590   3.79
## 22  13.0    0.910 -1.26          0.918   0.325   0.06120   3.33
## 23  14.0    0.882 -1.25          0.856   0.272   0.04420   2.95
## 24  15.0    0.889 -1.19          0.864   0.230   0.03230   2.63
## 25  16.0    0.913 -1.18          0.893   0.198   0.02240   2.35
## 26  17.0    0.930 -1.16          0.912   0.172   0.01620   2.20
## 27  18.0    0.955 -1.19          0.947   0.151   0.01190   2.09
## 28  19.0    0.908 -1.21          0.896   0.134   0.00896   1.99
## 29  20.0    0.916 -1.21          0.906   0.120   0.00649   1.91
plot(sft2$fitIndices[,1], -sign(sft2$fitIndices[,3])*sft2$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"))
text(sft2$fitIndices[,1], -sign(sft2$fitIndices[,3])*sft2$fitIndices[,2], labels=powers,cex=0.9,col="red")
abline(h=0.9,col="salmon")

0.2.1.1 vst

ordination

net <- blockwiseModules(data, power=3, TOMType="unsigned", networkType="signed")
modules_of_interest <-  c("green", "turquoise", "tan")
mergedColors <-  labels2colors(net$colors)
modules_of_interest <-  mergedColors2 %>% unique()
df <- module_df %>% 
  rename("id" = "asv")
df <- df %>% 
  dplyr::select(-"id") %>% 
  mutate(colors = as.factor(colors))

ps.inall.col <- ps.inall
tx <- cbind(taxa, df)
tax_table(ps.inall.col) <- tax_table(as.matrix(tx))
ord <- ordinate(ps.inall.col, "NMDS", "unifrac")
plot_ordination(ps.inall.col, ord, type = "species", color = "colors") + 
  scale_color_manual(values = c("blue", "brown", "green", "salmon",  "turquoise")) +
  theme_bw()

ordination chunk

ps.inall.col <- ps.inall
tx <- cbind(taxa, df)
ps.inall.col@tax_table
tax_table(ps.inall.col) <- tax_table(as.matrix(tx))
ord <- ordinate(ps.inall.col, "NMDS", "unifrac")
plot_ordination(ps.inall.col, ord, type = "species", color = "colors") + 
  scale_color_manual(values = c("blue", "brown", "green", "salmon",  "turquoise")) +
  theme_bw()